Summary

PCA for wecare-only (b37) wes WECARE+NFE dataset w/o Danish:

  • Suggests but not excludes some same outliers as the previous analyses: P5_E09 and possibly P6_D05, P1_C12, P1_D08, P5_E02
  • Shows no irregularities in PCA analysis
  • Provides evidence for using top 2 PC-s in regression model

To clarify

An e-mail was sent to bigsnpr package author:

  • Why U-values (eigenvectors?) are numerically different from PC-s plotted by "scores" plot?
  • How can I plot lines +/- 6xSD on my "scores" plot?
  • I use eigenvectors in regression models to correct for population stratification; can I use U-values directly or should I scale it as you do in the "scores" plot?

The answer was qiock but poorly worded (intended to show off rather than to clarify):
https://github.com/privefl/bigstatsr/issues/128#issuecomment-768277757

References

Start section

# Time
Sys.time()
## [1] "2021-01-31 14:05:29 GMT"
# Memory
gc()
##          used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 411961 22.1     856251 45.8         NA   641560 34.3
## Vcells 790279  6.1    8388608 64.0      16384  1768509 13.5
# Clean up
rm(list=ls())
graphics.off()

# Options
options(stringsAsFactors = F)

# Folders
base_folder <- "/Users/alexey/Documents"
project_folder <- file.path(base_folder,"wecare","final_analysis_2021","reanalysis_wo_danish_2021","s02_wes_wecare_nfe")
scripts_folder <- file.path(project_folder,"scripts","s07_relatedness_and_pca")
setwd(scripts_folder)
data_folder <- file.path(project_folder,"data","s07_relatedness_and_pca")

# Libraries
library(bigsnpr) # for bed_autoSVD() and bed()
## Loading required package: bigstatsr
library(bigutilsr) # for prob_dist() and tukey_mc_up() for outlier detection
library(hexbin) # for plotting svd loadings
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
NCORES <- 1
#NCORES <- nb_cores() # 2

Read bed-bim-fam file-set

# Location of bed file
bed_file <- file.path(data_folder,"s03_non_related","common_biallelic_autosomal_snps_in_HWE_norel.bed")

# Attach PLINK data to R environment
wecare.bed <- bed(bed_file) # bigsnpr::bed

# Explore wecare.bed
wecare.bed
## A 'bed' object with 531 samples and 18628 variants.
names(wecare.bed)
##  [1] ".->bedfile"   "light"        ".self"        ".->.map"      "show"         "extptr"       "nrow"         "bimfile"      ".refClassDef" "bedfile"      "ncol"         "initialize"   "map"          "fam"          ".map"         ".->extptr"    "famfile"      ".fam"         "address"      ".->.fam"      "prefix"
#attributes(wecare.bed)
#str(wecare.bed)
#wecare.bed$bedfile
#wecare.bed$address

# Clean-up
rm(bed_file)

Phenotypes

# Phenotypes from plink
wecare_fam.df <- wecare.bed$fam
dim(wecare_fam.df)
## [1] 531   6
head(wecare_fam.df)
##   family.ID sample.ID paternal.ID maternal.ID sex affection
## 1   HG00097   HG00097           0           0   2        -9
## 2   HG00099   HG00099           0           0   2        -9
## 3   HG00100   HG00100           0           0   2        -9
## 4   HG00102   HG00102           0           0   2        -9
## 5   HG00106   HG00106           0           0   2        -9
## 6   HG00110   HG00110           0           0   2        -9
table(wecare_fam.df$affection, useNA = "always")
## 
##   -9    1    2 <NA> 
##  196  172  163    0
# Phenotypes from the main dataset file
load(file.path(project_folder,"data","s06_qc_filters","s04_filter_by_sample_call_rates.RData"))
rm(genotypes.mx,variants.df)

# Update folders
base_folder <- "/Users/alexey/Documents"
project_folder <- file.path(base_folder,"wecare","final_analysis_2021","reanalysis_wo_danish_2021","s02_wes_wecare_nfe")
scripts_folder <- file.path(project_folder,"scripts","s07_relatedness_and_pca")
data_folder <- file.path(project_folder,"data","s07_relatedness_and_pca")

dim(phenotypes.df)
## [1] 532  40
str(phenotypes.df)
## 'data.frame':    532 obs. of  40 variables:
##  $ wes_id        : chr  "HG00097" "HG00099" "HG00100" "HG00102" ...
##  $ gwas_id       : chr  NA NA NA NA ...
##  $ merged_id     : chr  NA NA NA NA ...
##  $ filter        : chr  "pass" "pass" "pass" "pass" ...
##  $ cc            : num  -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
##  $ setno         : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ registry      : chr  NA NA NA NA ...
##  $ family_history: int  NA NA NA NA NA NA NA NA NA NA ...
##  $ age_dx        : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ age_ref       : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ rstime        : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ eig1_gwas     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ eig2_gwas     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ eig3_gwas     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ eig4_gwas     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ eig5_gwas     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ stage         : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ er            : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ pr            : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ hist_cat      : chr  NA NA NA NA ...
##  $ hormone       : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ chemo_cat     : chr  NA NA NA NA ...
##  $ br_xray       : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ br_xray_dose  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ age_menarche  : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ age_1st_ftp   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ age_menopause : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ num_preg      : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ bmi_age18     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ bmi_dx        : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ bmi_ref       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ eig1_wecare   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ eig2_wecare   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ eig3_wecare   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ eig4_wecare   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ eig5_wecare   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ Batch_ID      : chr  NA NA NA NA ...
##  $ Subject_ID    : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ Barcode       : chr  NA NA NA NA ...
##  $ danish        : logi  NA NA NA NA NA NA ...
table(phenotypes.df$cc, useNA = "always")
## 
##   -1    0    1 <NA> 
##  197  172  163    0
table(phenotypes.df$filter, useNA = "always")
## 
## eigenvectors_outlier                 pass                 <NA> 
##                    2                  530                    0
#phenotypes.df[phenotypes.df$filter=="eigenvectors_outlier","wes_id"]
#"Possibly_related_to_P1-D05" -> phenotypes.df[phenotypes.df$wes_id=="P5_C12","filter"]

# Merge fam-file and phenotypes from the main dataset (removing samples that are not in fam-file) 
joined_phenotypes.df <- left_join(wecare_fam.df, phenotypes.df,
                                  by=c("sample.ID"="wes_id"))
dim(joined_phenotypes.df)
## [1] 531  45
colnames(joined_phenotypes.df)
##  [1] "family.ID"      "sample.ID"      "paternal.ID"    "maternal.ID"    "sex"            "affection"      "gwas_id"        "merged_id"      "filter"         "cc"             "setno"          "registry"       "family_history" "age_dx"         "age_ref"        "rstime"         "eig1_gwas"      "eig2_gwas"      "eig3_gwas"      "eig4_gwas"      "eig5_gwas"      "stage"          "er"             "pr"             "hist_cat"       "hormone"        "chemo_cat"      "br_xray"        "br_xray_dose"   "age_menarche"   "age_1st_ftp"    "age_menopause"  "num_preg"       "bmi_age18"      "bmi_dx"         "bmi_ref"        "eig1_wecare"    "eig2_wecare"    "eig3_wecare"    "eig4_wecare"    "eig5_wecare"    "Batch_ID"       "Subject_ID"     "Barcode"        "danish"
table(joined_phenotypes.df$filter, useNA = "always")
## 
## eigenvectors_outlier                 pass                 <NA> 
##                    2                  529                    0
table(joined_phenotypes.df$cc, useNA = "always")
## 
##   -1    0    1 <NA> 
##  196  172  163    0
table(joined_phenotypes.df$affection, useNA = "always")
## 
##   -9    1    2 <NA> 
##  196  172  163    0
#sum(joined_phenotypes.df$sample.ID == "P5_C12")

# Make sure that dplyr::left_joint hasnt changed the order of samples
sum(substr(joined_phenotypes.df$merged_id,1,6) != wecare_fam.df$sample.ID,na.rm = T)
## [1] 0
# Add column for PCA outliers ?
joined_phenotypes.df <- data.frame(joined_phenotypes.df, pca_outlier=F)

# Clean-up
rm(phenotypes.df, wecare_fam.df)

Variants

Data from bed-bim-fam only: explored; is it needed for calculation??

# map file
wecare_map.df <- wecare.bed$map
dim(wecare_map.df)
## [1] 18628     6
head(wecare_map.df)
##   chromosome    marker.ID genetic.dist physical.pos allele1 allele2
## 1          1 1_881627_G_A            0       881627       G       A
## 2          1 1_897325_G_C            0       897325       G       C
## 3          1 1_897738_C_T            0       897738       T       C
## 4          1 1_982941_T_C            0       982941       T       C
## 5          1 1_982994_T_C            0       982994       T       C
## 6          1 1_987200_C_T            0       987200       C       T
# make simple counts 
wecare_maf.df <- bed_MAF(wecare.bed)
dim(wecare_maf.df)
## [1] 18628     5
head(wecare_maf.df)
##    ac mac         af        maf   N
## 1 326 326 0.33887734 0.33887734 481
## 2  63  63 0.06745182 0.06745182 467
## 3  60  60 0.06479482 0.06479482 463
## 4  93  93 0.09011628 0.09011628 516
## 5  92  92 0.08829175 0.08829175 521
## 6 104 104 0.09942639 0.09942639 523
# merge map file with the counts
joined_variants.df <- cbind(wecare_map.df,wecare_maf.df)
dim(joined_variants.df)
## [1] 18628    11
head(joined_variants.df)
##   chromosome    marker.ID genetic.dist physical.pos allele1 allele2  ac mac         af        maf   N
## 1          1 1_881627_G_A            0       881627       G       A 326 326 0.33887734 0.33887734 481
## 2          1 1_897325_G_C            0       897325       G       C  63  63 0.06745182 0.06745182 467
## 3          1 1_897738_C_T            0       897738       T       C  60  60 0.06479482 0.06479482 463
## 4          1 1_982941_T_C            0       982941       T       C  93  93 0.09011628 0.09011628 516
## 5          1 1_982994_T_C            0       982994       T       C  92  92 0.08829175 0.08829175 521
## 6          1 1_987200_C_T            0       987200       C       T 104 104 0.09942639 0.09942639 523
# Variants with AF(ref) < AF(alt)
inverted <- joined_variants.df$ac != joined_variants.df$mac
sum(inverted)
## [1] 5
joined_variants.df[inverted,]
##      chromosome       marker.ID genetic.dist physical.pos allele1 allele2  ac mac        af       maf   N
## 2594          2 2_101010082_G_C            0    101010082       G       C 513 511 0.5009766 0.4990234 512
## 3101          2 2_215855780_G_A            0    215855780       G       A 527 525 0.5009506 0.4990494 526
## 6368          6  6_28542520_C_T            0     28542520       C       T 510 508 0.5009823 0.4990177 509
## 7057          6  6_97613163_T_C            0     97613163       C       T 524 522 0.5009560 0.4990440 523
## 7431          6 6_166579270_C_T            0    166579270       C       T 512 510 0.5009785 0.4990215 511
# Clean-up
rm(wecare_map.df, wecare_maf.df, inverted)

PCA

Could be sequentially repeated if outliers are detected

1st round

Takes care about LD etc.
See ?plot.big_SVD for plotting svd objets.

# Get indices of non-outliers in format required by bed_autoSVD
# (integer indices, indicating row numbers)
non_outliers1 <- which(!joined_phenotypes.df$pca_outlier)
length(non_outliers1)
## [1] 531
# bigsnpr::bed_autoSVD, Default k = 10
#using non-outlier samples (ind.row) and all variants (ind.col) 
#table(wecare.bed$map$chromosome) - if complains abotut non-numeric chromosomes
wecare.svd1 <- bed_autoSVD(wecare.bed, 
                          ind.row=non_outliers1, 
                          ncores = NCORES) 
## 
## Phase of clumping (on MAC) at r^2 > 0.2.. keep 9707 variants.
## Discarding 0 variant with MAC < 10.
## 
## Iteration 1:
## Computing SVD..
## 132 outlier variants detected..
## 1 long-range LD region detected..
## 
## Iteration 2:
## Computing SVD..
## 0 outlier variant detected..
## 
## Converged!
#ind.col=vars_not_in_LD, 

# Variants not in LD (detected by clumping during autoSVD)
vars_not_in_LD1 <- attr(wecare.svd1, "subset")
length(vars_not_in_LD1)
## [1] 9575
#attributes(wecare.svd)
str(wecare.svd1)
## List of 7
##  $ d     : num [1:10] 159 140 126 120 120 ...
##  $ u     : num [1:531, 1:10] 0.01862 0.04007 0.04438 0.04531 0.00228 ...
##  $ v     : num [1:9575, 1:10] -0.00744 -0.00137 -0.00804 -0.00855 0.00478 ...
##  $ niter : num 16
##  $ nops  : num 248
##  $ center: num [1:9575] 0.678 0.135 0.199 0.198 0.425 ...
##  $ scale : num [1:9575] 0.669 0.355 0.423 0.422 0.579 ...
##  - attr(*, "class")= chr "big_SVD"
##  - attr(*, "subset")= int [1:9575] 1 2 6 7 9 11 12 13 15 16 ...
##  - attr(*, "lrldr")='data.frame':    1 obs. of  3 variables:
##   ..$ Chr  : num 6
##   ..$ Start: num 33648228
##   ..$ Stop : num 55266625
# Eigenvalues
length(wecare.svd1$d)
## [1] 10
wecare.svd1$d
##  [1] 159.4512 139.5095 126.1602 120.4182 120.0438 119.5141 119.2688 119.1245 118.7687 118.3971
plot(wecare.svd1) # default type="screeplot" see ?plot.big_SVD  
## Warning: Continuous limits supplied to discrete scale.
## Did you mean `limits = factor(...)` or `scale_*_continuous()`?

# Loadings
dim(wecare.svd1$v)
## [1] 9575   10
head(wecare.svd1$v)
##              [,1]         [,2]          [,3]         [,4]         [,5]         [,6]         [,7]         [,8]         [,9]        [,10]
## [1,] -0.007442316  0.003834898 -0.0004928298 -0.011781062 -0.016935819  0.004184202  0.003527457 -0.006785065 -0.012506751  0.018771114
## [2,] -0.001368639  0.004052780 -0.0082892737  0.002937288 -0.010744252  0.013751571  0.002355065 -0.005999013 -0.024960317  0.008184192
## [3,] -0.008036731 -0.001822211 -0.0116159936 -0.005933570  0.003520575 -0.027334667 -0.001172901  0.005634025  0.001903181 -0.002592940
## [4,] -0.008545381  0.005182089  0.0117814320  0.001751618  0.015245286 -0.021063106  0.004673890 -0.012092966  0.011323376  0.002888324
## [5,]  0.004776277 -0.023937956 -0.0073829825  0.001888633  0.006850649  0.004019456 -0.007861765 -0.004675818  0.005442361 -0.003333888
## [6,] -0.011186694 -0.021340043  0.0232951518  0.007873729 -0.000140632 -0.011679733 -0.003482157 -0.015028438 -0.002562494 -0.014964663
# Loadings summary (for PCs from 1 to 10)
plot(wecare.svd1,type="loadings",loadings=1:10,coeff=0.4)

# Eigenvectors
dim(wecare.svd1$u)
## [1] 531  10
head(wecare.svd1$u)
##             [,1]         [,2]         [,3]         [,4]         [,5]        [,6]         [,7]         [,8]         [,9]        [,10]
## [1,] 0.018622738  0.019854709  0.021727834  0.040897274 -0.023522635 -0.12298425  0.013204443  0.016949692  0.052266173  0.005252450
## [2,] 0.040068650 -0.030401205 -0.007037642 -0.005381513  0.063195958 -0.03780878 -0.004445595 -0.058644607  0.013258496  0.056564389
## [3,] 0.044375826  0.005365033  0.005296443 -0.041839396  0.007915045 -0.03751976  0.015726508  0.005636768  0.048383012  0.020846493
## [4,] 0.045311987  0.022983335  0.018884413  0.028688205 -0.017291190 -0.08735035  0.056054534  0.044640043  0.013983675 -0.022166526
## [5,] 0.002278271  0.006678138  0.048083894 -0.010012405 -0.025999349  0.03106204  0.093311736 -0.080248583 -0.009133683  0.049155780
## [6,] 0.036247260  0.007615541  0.087170557  0.087995074 -0.004003836  0.03703239  0.028481750 -0.012679235 -0.115985155 -0.009126424
# PCA summary (for PCs from 1 to 10)
plot(wecare.svd1,type = "scores",scores=1:10,coeff=0.4)

# Calculate a measure of "outlieness"  
U1 <- wecare.svd1$u
prob1 <- prob_dist(U1, ncores=NCORES) # bigutilsr::prob_dist
S1 <- prob1$dist.self / sqrt(prob1$dist.nn)
tukey_threshold1 <- tukey_mc_up(S1) # bigutilsr::tukey_mc_up

# Outliers
outliers1 <- S1 >= tukey_threshold1
sum(outliers1)
## [1] 3
outliers_id1 <- wecare.bed$fam$sample.ID[S1 >= tukey_threshold1]
outliers_id1
## [1] "P1_E06" "P2_C12" "P5_E09"
# Histogram by "outlieness" score
ggplot() +
  geom_histogram(aes(S1), color = "black", fill = "blue", alpha = 0.3) +
  theme_bigstatsr() +
  geom_vline(xintercept=tukey_threshold1, colour="red") +
  labs(x = "Statistic of outlierness (S)", y = "Frequency (sqrt-scale)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Location of outlier(s) in PCA plots
plot(U1[, 1:2], col = (S1 > tukey_threshold1) + 1, pch = 20)

plot(U1[, 3:4], col = (S1 > tukey_threshold1) + 1, pch = 20)

plot(U1[, 5:6], col = (S1 > tukey_threshold1) + 1, pch = 20)

# Add outlier to the phenotypes data frame
joined_phenotypes.df$pca_outlier <- joined_phenotypes.df$pca_outlier | 
  joined_phenotypes.df$sample.ID %in% outliers_id1

sum(joined_phenotypes.df$pca_outlier)
## [1] 3
# Clean-up
rm(non_outliers1,U1,prob1,S1,tukey_threshold1,
   outliers1, wecare.svd1, outliers_id1) #vars_not_in_LD1, 

2nd round

Re-run w/o previously detected outlier(s)
Using previously calculated vars_not_in_LD to speed-up
https://privefl.github.io/bigsnpr/articles/bedpca.html

# Get indices of non-outliers in format required by bed_autoSVD
# (integer indices, indicating row numbers)
non_outliers2 <- which(!joined_phenotypes.df$pca_outlier)
length(non_outliers2)
## [1] 528
# Calculate PCA
wecare.svd2 <- bed_autoSVD(wecare.bed, 
                          ind.row=non_outliers2, 
                          ncores = NCORES) 
## 
## Phase of clumping (on MAC) at r^2 > 0.2.. keep 9704 variants.
## Discarding 0 variant with MAC < 10.
## 
## Iteration 1:
## Computing SVD..
## 131 outlier variants detected..
## 1 long-range LD region detected..
## 
## Iteration 2:
## Computing SVD..
## 0 outlier variant detected..
## 
## Converged!
# ind.col=vars_not_in_LD1 - removes the outlier 

# Variants not in LD (detected by clumping during autoSVD)
vars_not_in_LD2 <- attr(wecare.svd2, "subset")
length(vars_not_in_LD2)
## [1] 9573
length(vars_not_in_LD1)
## [1] 9575
# Explore PCA results
plot(wecare.svd2)
## Warning: Continuous limits supplied to discrete scale.
## Did you mean `limits = factor(...)` or `scale_*_continuous()`?

plot(wecare.svd2, type = "loadings", loadings=1:10, coeff=0.4)

plot(wecare.svd2, type = "scores", scores=1:10, coeff=0.4)

# Calculate a measure of "outlieness"  
U2 <- wecare.svd2$u
prob2 <- prob_dist(U2, ncores=NCORES) # bigutilsr::prob_dist
S2 <- prob2$dist.self / sqrt(prob2$dist.nn)
tukey_threshold2 <- tukey_mc_up(S2) # bigutilsr::tukey_mc_up

# Outliers
outliers2 <- S2 >= tukey_threshold2
sum(outliers2)
## [1] 0
#outliers_id2 <- wecare.bed$fam$sample.ID[S2 >= tukey_threshold2]
#outliers_id2

# Histogram by "outlieness" score
ggplot() +
  geom_histogram(aes(S2), color = "black", fill = "blue", alpha = 0.3) +
  theme_bigstatsr() +
  geom_vline(xintercept=tukey_threshold2, colour="red") +
  labs(x = "Statistic of outlierness (S)", y = "Frequency (sqrt-scale)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Location of outlier(s) in PCA plots
#plot(U2[, 1:2], col = (S2 > tukey_threshold2) + 1, pch = 20)
#plot(U2[, 3:4], col = (S2 > tukey_threshold2) + 1, pch = 20)
#plot(U2[, 5:6], col = (S2 > tukey_threshold2) + 1, pch = 20)

# Add outlier to the phenotypes data frame
#joined_phenotypes.df$pca_outlier <- joined_phenotypes.df$pca_outlier | 
#  joined_phenotypes.df$sample.ID %in% outliers_id2

#sum(joined_phenotypes.df$pca_outlier)

# Clean-up
rm(non_outliers2,U2,prob2,S2,tukey_threshold2,outliers2,
   vars_not_in_LD1,vars_not_in_LD2) # vars_not_in_LD2, outliers_id2,wecare.svd2 

3rd round

Omitted because of no outliers

Re-run w/o previously detected outlier(s)
Using previously calculated vars_not_in_LD to speed-up

https://privefl.github.io/bigsnpr/articles/bedpca.html

Update phenotypes table

phenotypes_with_PCs.df <- joined_phenotypes.df[!joined_phenotypes.df$pca_outlier,]
dim(phenotypes_with_PCs.df)
## [1] 528  46
#eigenvectors.mx <- wecare.svd3$u
eigenvectors.mx <- wecare.svd2$u
dim(eigenvectors.mx)
## [1] 528  10
colnames(eigenvectors.mx) <- 
  c("pc1","pc2","pc3","pc4","pc5","pc6","pc7","pc8","pc9","pc10")

phenotypes_with_PCs.df <- cbind(phenotypes_with_PCs.df, eigenvectors.mx)
dim(phenotypes_with_PCs.df)
## [1] 528  56
colnames(phenotypes_with_PCs.df)
##  [1] "family.ID"      "sample.ID"      "paternal.ID"    "maternal.ID"    "sex"            "affection"      "gwas_id"        "merged_id"      "filter"         "cc"             "setno"          "registry"       "family_history" "age_dx"         "age_ref"        "rstime"         "eig1_gwas"      "eig2_gwas"      "eig3_gwas"      "eig4_gwas"      "eig5_gwas"      "stage"          "er"             "pr"             "hist_cat"       "hormone"        "chemo_cat"      "br_xray"        "br_xray_dose"   "age_menarche"   "age_1st_ftp"    "age_menopause"  "num_preg"       "bmi_age18"      "bmi_dx"         "bmi_ref"        "eig1_wecare"    "eig2_wecare"    "eig3_wecare"    "eig4_wecare"    "eig5_wecare"    "Batch_ID"       "Subject_ID"     "Barcode"        "danish"         "pca_outlier"    "pc1"            "pc2"            "pc3"            "pc4"            "pc5"            "pc6"            "pc7"            "pc8"            "pc9"            "pc10"
# Check consistency with the previous eigenvectors from WES
sum(is.na((phenotypes_with_PCs.df$eig1_wecare)))
## [1] 196
plot(phenotypes_with_PCs.df$eig1_wecare,
     phenotypes_with_PCs.df$pc1,main="PC1: new vs old WES")

plot(phenotypes_with_PCs.df$eig2_wecare,
     phenotypes_with_PCs.df$pc2,main="PC2: new vs old WES")

# Check consistency with the previous eigenvectors from GWAS
sum(is.na((phenotypes_with_PCs.df$eig1_gwas)))
## [1] 196
plot(phenotypes_with_PCs.df$eig1_gwas,
     phenotypes_with_PCs.df$pc1,main="PC1: new WES vs GWAs")

plot(phenotypes_with_PCs.df$eig2_gwas,
     phenotypes_with_PCs.df$pc2,main="PC2: new WES vs GWAs")

# Clean-up
rm(joined_phenotypes.df, eigenvectors.mx) # vars_not_in_LD1
#somehow close wecare.bed ?

Calculate PC-outliers by SD

P6_D05 and P5_E09 are clear outliers when projected to 1kgp

6x sd

sd_threshold <- 6

# PC1 outliers
pc1 <- phenotypes_with_PCs.df$pc1
pc1_mean <- mean(pc1)
pc1_sd  <- sd(pc1)
lo_pc1 <- pc1 < pc1_mean - sd_threshold * pc1_sd
hi_pc1 <- pc1 > pc1_mean + sd_threshold * pc1_sd

cat("pc1 lo/hi:",sum(lo_pc1),"/",sum(hi_pc1),"\n")
## pc1 lo/hi: 0 / 0
phenotypes_with_PCs.df$sample.ID[lo_pc1]
## character(0)
phenotypes_with_PCs.df$sample.ID[hi_pc1]
## character(0)
# PC2 outliers
pc2 <- phenotypes_with_PCs.df$pc2
pc2_mean <- mean(pc2)
pc2_sd  <- sd(pc2)
lo_pc2 <- pc2 < pc2_mean - sd_threshold * pc2_sd
hi_pc2 <- pc2 > pc2_mean + sd_threshold * pc2_sd

cat("pc2 lo/hi:",sum(lo_pc2),"/",sum(hi_pc2),"\n")
## pc2 lo/hi: 0 / 5
phenotypes_with_PCs.df$sample.ID[lo_pc2]
## character(0)
phenotypes_with_PCs.df$sample.ID[hi_pc2]
## [1] "P1_C05" "P1_C12" "P1_D08" "P5_E02" "P6_D05"
# PC3 outliers
pc3 <- phenotypes_with_PCs.df$pc3
pc3_mean <- mean(pc3)
pc3_sd  <- sd(pc3)
lo_pc3 <- pc3 < pc3_mean - sd_threshold * pc3_sd
hi_pc3 <- pc3 > pc3_mean + sd_threshold * pc3_sd

cat("pc3 lo/hi:",sum(lo_pc3),"/",sum(hi_pc3),"\n")
## pc3 lo/hi: 0 / 0
phenotypes_with_PCs.df$sample.ID[lo_pc3]
## character(0)
phenotypes_with_PCs.df$sample.ID[hi_pc3]
## character(0)
# Clean-up
rm(sd_threshold,
   pc1, pc1_mean, pc1_sd, lo_pc1, hi_pc1,
   pc2, pc2_mean, pc2_sd, lo_pc2, hi_pc2,
   pc3, pc3_mean, pc3_sd, lo_pc3, hi_pc3)

Detailed PCA plots

plot(wecare.svd2, type = "scores") +
     aes(color = as.factor(phenotypes_with_PCs.df$affection)) +
     labs(title = NULL, color = "Case") 

# + geom_hline(pc2_mean ... - sd_threshold * pc2_sd,
#              linetype="dashed", color = "red")

plot(wecare.svd2, type = "scores", scores=3:4) +
     aes(color = as.factor(phenotypes_with_PCs.df$affection)) +
     labs(title = NULL, color = "Case")

plot(wecare.svd2, type = "scores", scores=5:6) +
     aes(color = as.factor(phenotypes_with_PCs.df$affection)) +
     labs(title = NULL, color = "Case")

plot(wecare.svd2, type = "scores", scores=7:8) +
     aes(color = as.factor(phenotypes_with_PCs.df$affection)) +
     labs(title = NULL, color = "Case")

plot(wecare.svd2, type = "scores", scores=9:10) +
     aes(color = as.factor(phenotypes_with_PCs.df$affection)) +
     labs(title = NULL, color = "Case")

Save results

save.image(file.path(data_folder,"s04_calculate_PCs.RData"))
save(phenotypes_with_PCs.df,file=file.path(data_folder,"s04_phenotypes_with_PCs.RData"))

End section

ls()
## [1] "base_folder"            "data_folder"            "joined_variants.df"     "NCORES"                 "phenotypes_with_PCs.df" "project_folder"         "scripts_folder"         "wecare.bed"             "wecare.svd2"
Sys.time()
## [1] "2021-01-31 14:06:07 GMT"
gc()
##           used  (Mb) gc trigger  (Mb) limit (Mb) max used  (Mb)
## Ncells 2135110 114.1    6058580 323.6         NA  6058580 323.6
## Vcells 4587992  35.1   28599140 218.2      16384 69822102 532.8